In [2]:
#import libraries
from datetime import datetime, timedelta,date
import pandas as pd
%matplotlib inline
from sklearn.metrics import classification_report,confusion_matrix
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from __future__ import division
from sklearn.cluster import KMeans
In [3]:
from __future__ import division
In [4]:
import chart_studio.plotly as py
import plotly.graph_objs as go
import plotly.offline as pyoff
In [5]:
pip install xgboost
Requirement already satisfied: xgboost in ./opt/anaconda3/lib/python3.9/site-packages (1.6.2)
Requirement already satisfied: scipy in ./opt/anaconda3/lib/python3.9/site-packages (from xgboost) (1.7.3)
Requirement already satisfied: numpy in ./opt/anaconda3/lib/python3.9/site-packages (from xgboost) (1.21.5)
Note: you may need to restart the kernel to use updated packages.
In [6]:
import xgboost as xgb
from sklearn.model_selection import KFold, cross_val_score, train_test_split

import xgboost as xgb
In [7]:
pyoff.init_notebook_mode()
In [8]:
tx_data = pd.read_csv('/Users/hassaniftikhar4472/Downloads/data/LTV.csv')
In [10]:
tx_user = tx_data
In [11]:
tx_cluster = tx_user
In [12]:
tx_cluster['User_ID'] = pd.Series(range(1,tx_cluster.shape[0]))
In [13]:
one_hot_encoded_data = pd.get_dummies(tx_cluster, columns = ['Segment'])
In [14]:
tx_class = one_hot_encoded_data 
In [15]:
corr_matrix = tx_class.corr()
corr_matrix['LTVCluster'].sort_values(ascending=False)
Out[15]:
LTVCluster            1.000000
m6_Revenue            0.678410
Frequency             0.403291
FrequencyCluster      0.379138
Segment_High-Value    0.373549
OverallScore          0.353104
Revenue               0.234242
RecencyCluster        0.227060
RevenueCluster        0.188523
Segment_Mid-Value     0.027161
User_ID              -0.050461
Recency              -0.243686
Segment_Low-Value    -0.249520
Name: LTVCluster, dtype: float64
In [16]:
X = tx_class.drop(['LTVCluster','m6_Revenue'],axis=1)
y = tx_class['LTVCluster']
In [17]:
import xgboost as xgb
from sklearn.model_selection import KFold, cross_val_score, train_test_split
In [18]:
import xgboost as xgb
In [19]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.05, random_state=56)
In [20]:
import xgboost as xgb
from sklearn.model_selection import KFold, cross_val_score, train_test_split

import xgboost as xgb
from xgboost import XGBRegressor
In [21]:
ltv_xgb_model = xgb.XGBClassifier(max_depth=5, learning_rate=0.1,objective= 'multi:softprob',n_jobs=-1).fit(X_train, y_train, verbose =2)

print('Accuracy of XGB classifier on training set: {:.2f}'
       .format(ltv_xgb_model.score(X_train, y_train)))
print('Accuracy of XGB classifier on test set: {:.2f}'
       .format(ltv_xgb_model.score(X_test[X_train.columns], y_test)))

y_pred = ltv_xgb_model.predict(X_test)
Accuracy of XGB classifier on training set: 0.94
Accuracy of XGB classifier on test set: 0.94
In [24]:
tx_class
Out[24]:
User_ID Recency RecencyCluster Frequency FrequencyCluster Revenue RevenueCluster OverallScore m6_Revenue LTVCluster Segment_High-Value Segment_Low-Value Segment_Mid-Value
0 1.0 0 3 9 1 0.287363 0 4 0.452209 2 0 0 1
1 2.0 0 3 6 1 0.028388 0 4 0.142762 2 0 0 1
2 3.0 0 3 8 1 0.014870 0 4 0.004203 0 0 0 1
3 4.0 1 3 6 1 0.012853 0 4 0.002923 0 0 0 1
4 5.0 3 3 8 1 0.005576 0 4 0.000000 0 0 0 1
... ... ... ... ... ... ... ... ... ... ... ... ... ...
5604127 5604128.0 13 2 15 2 36.428239 3 7 0.000000 0 1 0 0
5604128 5604129.0 13 2 12 2 28.100487 3 7 0.000000 0 1 0 0
5604129 5604130.0 12 2 16 2 27.944217 3 7 0.000000 0 1 0 0
5604130 5604131.0 7 2 18 2 24.543765 3 7 0.468071 2 1 0 0
5604131 NaN 11 2 13 2 17.675474 3 7 0.000000 0 1 0 0

5604132 rows × 13 columns

In [25]:
tx_class.groupby('LTVCluster').User_ID.count()/tx_class.User_ID.count()
Out[25]:
LTVCluster
0    0.937227
1    0.015655
2    0.047117
Name: User_ID, dtype: float64
In [26]:
y_pred = ltv_xgb_model.predict(X_test)
In [28]:
print (classification_report(y_test, y_pred))
              precision    recall  f1-score   support

           0       0.96      0.99      0.97    262637
           1       0.48      0.23      0.31      4405
           2       0.48      0.22      0.30     13165

    accuracy                           0.94    280207
   macro avg       0.64      0.48      0.53    280207
weighted avg       0.93      0.94      0.93    280207

In [35]:
X_test
Out[35]:
User_ID Recency RecencyCluster Frequency FrequencyCluster Revenue RevenueCluster OverallScore Segment_High-Value Segment_Low-Value Segment_Mid-Value
254700 254701.0 0 3 7 1 0.458909 0 4 0 0 1
958295 958296.0 0 3 26 3 0.112509 0 6 1 0 0
4794539 4794540.0 27 0 1 0 0.000000 0 0 0 1 0
3469837 3469838.0 13 2 1 0 0.000000 0 2 0 1 0
4177086 4177087.0 7 2 1 0 0.042354 0 2 0 1 0
... ... ... ... ... ... ... ... ... ... ... ...
4825853 4825854.0 27 0 1 0 0.000720 0 0 0 1 0
3460000 3460001.0 13 2 1 0 0.000000 0 2 0 1 0
3893039 3893040.0 10 2 1 0 0.000000 0 2 0 1 0
2787908 2787909.0 17 1 2 0 0.014556 0 1 0 1 0
1115561 1115562.0 0 3 2 0 0.003920 0 3 0 0 1

280207 rows × 11 columns

In [32]:
y_test
Out[32]:
254700     2
958295     0
4794539    0
3469837    0
4177086    0
          ..
4825853    0
3460000    0
3893039    0
2787908    0
1115561    0
Name: LTVCluster, Length: 280207, dtype: int64
In [40]:
X
Out[40]:
User_ID Recency RecencyCluster Frequency FrequencyCluster Revenue RevenueCluster OverallScore Segment_High-Value Segment_Low-Value Segment_Mid-Value
0 1.0 0 3 9 1 0.287363 0 4 0 0 1
1 2.0 0 3 6 1 0.028388 0 4 0 0 1
2 3.0 0 3 8 1 0.014870 0 4 0 0 1
3 4.0 1 3 6 1 0.012853 0 4 0 0 1
4 5.0 3 3 8 1 0.005576 0 4 0 0 1
... ... ... ... ... ... ... ... ... ... ... ...
5604127 5604128.0 13 2 15 2 36.428239 3 7 1 0 0
5604128 5604129.0 13 2 12 2 28.100487 3 7 1 0 0
5604129 5604130.0 12 2 16 2 27.944217 3 7 1 0 0
5604130 5604131.0 7 2 18 2 24.543765 3 7 1 0 0
5604131 NaN 11 2 13 2 17.675474 3 7 1 0 0

5604132 rows × 11 columns

In [ ]: